{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Enron Dataset Preprocessing : Parsing the raw dataset\n",
    "\n",
    "In this notebook, we parse the raw dataset in to a single csv file."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 1: Import libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import sys\n",
    "import email\n",
    "import dateutil\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 2: Download and extract the dataset\n",
    "\n",
    "The Enron dataset should be downloaded from `https://www.cs.cmu.edu/~enron/enron_mail_20150507.tar.gz` and extracted in the current directory.\n",
    "\n",
    "The following variable defines the location to the raw dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "MAIL_DIR = 'maildir/'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 3: Parse the dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We first define a couple helper functions to extract the emails sent by all individuals in the dataset."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def recursive_listdir(path):\n",
    "    \"\"\"Recursively walk from a given path\"\"\"\n",
    "    return [os.path.join(dp, f) for dp, dn, fn in os.walk(os.path.expanduser(path)) for f in fn]\n",
    "\n",
    "def get_sender_emails(user):\n",
    "    \"\"\"Generator that iterates over all the emails sent by a given user\"\"\"\n",
    "    # Check all all sub-folders in the user folder\n",
    "    for fpath in recursive_listdir(os.path.join(MAIL_DIR, user)):\n",
    "        # Ignore os specific file\n",
    "        if fpath.endswith('.DS_Store'):\n",
    "            continue\n",
    "        # Read email file\n",
    "        with open(fpath, 'rb') as f:\n",
    "            msg = email.message_from_binary_file(f)\n",
    "        # Parse date\n",
    "        date = msg['Date']\n",
    "        dt = dateutil.parser.parse(date)\n",
    "        t = dt.timestamp()\n",
    "        # Store data in a dict\n",
    "        mail_dict = {\n",
    "            'user': user,\n",
    "            'date': date,\n",
    "            'timestamp': t, \n",
    "            'file': re.sub(MAIL_DIR, '', fpath), \n",
    "        }\n",
    "        for key in ['From', 'To', 'Cc', 'Bcc', 'X-From', 'X-To', 'X-cc', 'X-bcc', 'X-Origin']:\n",
    "            mail_dict[key] = msg.get(key)\n",
    "\n",
    "        yield mail_dict"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Get all emails from all individuals."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the list of individuals\n",
    "SENDER_LIST = sorted([d for d in os.listdir(MAIL_DIR) if os.path.isdir(os.path.join(MAIL_DIR, d))])\n",
    "\n",
    "data = list()\n",
    "n_senders = len(SENDER_LIST)\n",
    "for i, sender in enumerate(SENDER_LIST):\n",
    "    print('{:d}/{:d} - Process user: {:<20s}'.format(i+1, n_senders, sender), end='\\r', flush=True)\n",
    "    data.extend(list(get_sender_emails(sender)))\n",
    "print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Visualize some mails"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame.from_dict(data)\n",
    "df.sample(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Save dataframe to `csv`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.to_csv('enron_dataset_raw.csv', encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
